#!/bin/sh
#
# iosnoop - A program to print disk I/O events as they happen, with useful
#           details such as UID, PID, filename (if available), command, etc.
#           Written using DTrace (Solaris 10 3/05, MacOS X 10.5).
#
# This is measuring events that have made it past system caches, such as
# disk events for local file systems, and network events for remote
# filesystems (such as NFS.)
#
# $Id: iosnoop 79 2011-11-14 05:49:44Z brendan $
#
# USAGE: 	iosnoop [-a|-A|-DeghiNostv] [-d device] [-f filename] 
#			[-m mount_point] [-n name] [-p PID]
#
#		iosnoop		# default output
#
#		-a		# print all data (mostly)
#		-A		# dump all data, space delimited
#		-D		# print time delta, us (elapsed)
#		-e		# print device name
#		-g		# print command arguments
#		-i		# print device instance
#		-N		# print major and minor numbers
#		-o		# print disk delta time, us 
#		-s		# print start time, us
#		-t 		# print completion time, us
#		-v		# print completion time, string
#		-d device	# instance name to snoop (eg, dad0)
#		-f filename	# full pathname of file to snoop
#		-m mount_point	# this FS only (will skip raw events)
#		-n name		# this process name only
#		-p PID		# this PID only
#  eg,
#		iosnoop -v	# human readable timestamps
#		iosnoop -N	# print major and minor numbers
#		iosnoop -m /	# snoop events on the root filesystem only
# 	
# FIELDS:
#		UID		user ID
#		PID		process ID
#		PPID		parennt process ID
#		COMM		command name for the process
#		ARGS		argument listing for the process
#		SIZE		size of operation, bytes
#		BLOCK		disk block for the operation (location)
#		STIME	 	timestamp for the disk request, us
#		TIME		timestamp for the disk completion, us
#		DELTA		elapsed time from request to completion, us
#		DTIME		time for disk to complete request, us
#		STRTIME		timestamp for the disk completion, string
#		DEVICE  	device name
#		INS     	device instance number
#		D		direction, Read or Write
#		MOUNT		mount point
#		FILE		filename (basename) for io operation
# 
# NOTE:
# - There are two different delta times reported. -D prints the
#   elapsed time from the disk request (strategy) to the disk completion
#   (iodone); -o prints the time for the disk to complete that event 
#   since it's last event (time between iodones), or, the time to the 
#   strategy if the disk had been idle. 
# - When filtering on PID or process name, be aware that poor disk event
#   times may be due to events that have been filtered away, for example
#   another process that may be seeking the disk heads elsewhere.
#
# SEE ALSO: BigAdmin: DTrace, http://www.sun.com/bigadmin/content/dtrace
#	    Solaris Dynamic Tracing Guide, http://docs.sun.com
#	    DTrace Tools, http://www.brendangregg.com/dtrace.html
#
# COPYRIGHT: Copyright (c) 2009 Brendan Gregg.
#
# CDDL HEADER START
#
#  The contents of this file are subject to the terms of the
#  Common Development and Distribution License, Version 1.0 only
#  (the "License").  You may not use this file except in compliance
#  with the License.
#
#  You can obtain a copy of the license at Docs/cddl1.txt
#  or http://www.opensolaris.org/os/licensing.
#  See the License for the specific language governing permissions
#  and limitations under the License.
#
# CDDL HEADER END
#
# 12-Mar-2004	Brendan Gregg	Created this, build 51.
# 23-May-2004	   "	  "	Fixed mntpt bug.
# 10-Oct-2004	   "      "	Rewritten to use the io provider, build 63.
# 04-Jan-2005	   "	  "	Wrapped in sh to provide options.
# 08-May-2005	   "      "	Rewritten for perfromance.
# 15-Jul-2005	   "      "	Improved DTIME calculation.
# 25-Jul-2005	   "      "	Added -p, -n. Improved code.
# 17-Sep-2005	   "      "	Increased switchrate.
# 15-Sep-2009	   "      "	Removed genunix for both MacOS X and NFS.
#


##############################
# --- Process Arguments ---
#

### default variables
opt_dump=0; opt_device=0; opt_delta=0; opt_devname=0; opt_file=0; opt_args=0; 
opt_mount=0; opt_start=0 opt_end=0; opt_endstr=0; opt_ins=0; opt_nums=0
opt_dtime=0; filter=0; device=.; filename=.; mount=.; pname=.; pid=0
opt_name=0; opt_pid=0

### process options
while getopts aAd:Def:ghim:Nn:op:stv name
do
	case $name in
	a)	opt_devname=1; opt_args=1; opt_endstr=1; opt_nums=1 ;;
	A)	opt_dump=1 ;;
	d)	opt_device=1; device=$OPTARG ;;
	D)	opt_delta=1 ;;
	e)	opt_devname=1 ;;
	f)	opt_file=1; filename=$OPTARG ;;
	g)	opt_args=1 ;;
	i)	opt_ins=1 ;;
	N)	opt_nums=1 ;;
	n)	opt_name=1; pname=$OPTARG ;;
	o)	opt_dtime=1 ;;
	p)	opt_pid=1; pid=$OPTARG ;;
	m)	opt_mount=1; mount=$OPTARG ;;
	s)	opt_start=1 ;;
	t)	opt_end=1 ;;
	v)	opt_endstr=1 ;;
	h|?)	cat <<-END >&2
		USAGE: iosnoop [-a|-A|-DeghiNostv] [-d device] [-f filename]
		               [-m mount_point] [-n name] [-p PID]
		       iosnoop          # default output
		                -a      # print all data (mostly)
		                -A      # dump all data, space delimited
		                -D      # print time delta, us (elapsed)
		                -e      # print device name
		                -g      # print command arguments
		                -i      # print device instance
		                -N      # print major and minor numbers
		                -o      # print disk delta time, us
		                -s      # print start time, us
		                -t      # print completion time, us
		                -v      # print completion time, string
		                -d device       # instance name to snoop 
		                -f filename     # snoop this file only
		                -m mount_point  # this FS only 
		                -n name         # this process name only 
		                -p PID          # this PID only 
		   eg,
		        iosnoop -v      # human readable timestamps
		        iosnoop -N      # print major and minor numbers
		        iosnoop -m /    # snoop events on filesystem / only
		END
		exit 1
	esac
done

### option logic
if [ $opt_dump -eq 1 ]; then
	opt_delta=0; opt_devname=0; opt_args=2; opt_start=0; 
	opt_end=0; opt_endstr=0; opt_nums=0; opt_ins=0; opt_dtime=0
fi
if [ $opt_device -eq 1 -o $opt_file -eq 1 -o $opt_mount -eq 1 -o \
    $opt_name -eq 1 -o $opt_pid -eq 1 ]; then
	filter=1
fi


#################################
# --- Main Program, DTrace ---
#
/usr/sbin/dtrace -n '
 /*
  * Command line arguments
  */
 inline int OPT_dump 	= '$opt_dump';
 inline int OPT_device 	= '$opt_device';
 inline int OPT_delta 	= '$opt_delta';
 inline int OPT_devname = '$opt_devname';
 inline int OPT_file 	= '$opt_file';
 inline int OPT_args 	= '$opt_args';
 inline int OPT_ins  	= '$opt_ins';
 inline int OPT_nums  	= '$opt_nums';
 inline int OPT_dtime 	= '$opt_dtime';
 inline int OPT_mount 	= '$opt_mount';
 inline int OPT_start 	= '$opt_start';
 inline int OPT_pid 	= '$opt_pid';
 inline int OPT_name 	= '$opt_name';
 inline int OPT_end 	= '$opt_end';
 inline int OPT_endstr 	= '$opt_endstr';
 inline int FILTER 	= '$filter';
 inline int PID 	= '$pid';
 inline string DEVICE 	= "'$device'";
 inline string FILENAME = "'$filename'";
 inline string MOUNT 	= "'$mount'";
 inline string NAME 	= "'$pname'";
 
 #pragma D option quiet
 #pragma D option switchrate=10hz

 /* boost the following if you get "dynamic variable drops" */
 #pragma D option dynvarsize=16m

 /*
  * Print header
  */
 dtrace:::BEGIN 
 {
	last_event[""] = 0;

	/* print optional headers */
 	OPT_start   ? printf("%-14s ","STIME(us)")   : 1;
 	OPT_end     ? printf("%-14s ","TIME(us)")    : 1;
 	OPT_endstr  ? printf("%-20s ","STRTIME") : 1;
 	OPT_devname ? printf("%-7s ","DEVICE")   : 1;
 	OPT_ins     ? printf("%-3s ","INS")      : 1;
 	OPT_nums    ? printf("%-3s %-3s ","MAJ","MIN") : 1;
 	OPT_delta   ? printf("%-10s ","DELTA(us)")   : 1;
 	OPT_dtime   ? printf("%-10s ","DTIME(us)")   : 1;

	/* print main headers */
	OPT_dump ? 
	    printf("%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
 	    "TIME", "STIME", "DELTA", "DEVICE", "INS", "MAJ", "MIN", "UID",
	    "PID", "PPID", "D", "BLOCK", "SIZE", "MOUNT", "FILE", "PATH",
	    "COMM","ARGS") :
 	    printf("%5s %5s %1s %8s %6s ", "UID", "PID", "D", "BLOCK", "SIZE");
	OPT_args == 0 ? printf("%10s %s\n", "COMM", "PATHNAME") : 1;
 	OPT_args == 1 ? printf("%28s %s\n", "PATHNAME", "ARGS") : 1;
 }

 /*
  * Check event is being traced
  */
 io:::start
 { 
	/* default is to trace unless filtering, */
	self->ok = FILTER ? 0 : 1;

	/* check each filter, */
	(OPT_device == 1 && DEVICE == args[1]->dev_statname)? self->ok = 1 : 1;
	(OPT_file == 1 && FILENAME == args[2]->fi_pathname) ? self->ok = 1 : 1;
	(OPT_mount == 1 && MOUNT == args[2]->fi_mount) ? self->ok = 1 : 1;
	(OPT_name == 1 && NAME == execname) ? self->ok = 1 : 1;
	(OPT_pid == 1 && PID == pid) ? self->ok = 1 : 1;
 }

 /*
  * Reset last_event for disk idle -> start
  * this prevents idle time being counted as disk time.
  */
 io:::start
 /! pending[args[1]->dev_statname]/
 {
	/* save last disk event */
	last_event[args[1]->dev_statname] = timestamp;
 }

 /*
  * Store entry details
  */
 io:::start
 /self->ok/
 {
	/* these are used as a unique disk event key, */
 	this->dev = args[0]->b_edev;
 	this->blk = args[0]->b_blkno;

	/* save disk event details, */
 	start_uid[this->dev, this->blk] = uid;
 	start_pid[this->dev, this->blk] = pid;
 	start_ppid[this->dev, this->blk] = ppid;
 	start_args[this->dev, this->blk] = (char *)curpsinfo->pr_psargs;
 	start_comm[this->dev, this->blk] = execname;
 	start_time[this->dev, this->blk] = timestamp;

	/* increase disk event pending count */
	pending[args[1]->dev_statname]++;

	self->ok = 0;
 }

 /*
  * Process and Print completion
  */
 io:::done
 /start_time[args[0]->b_edev, args[0]->b_blkno]/
 {
	/* decrease disk event pending count */
	pending[args[1]->dev_statname]--;

	/*
	 * Process details
	 */

 	/* fetch entry values */
 	this->dev = args[0]->b_edev;
 	this->blk = args[0]->b_blkno;
 	this->suid = start_uid[this->dev, this->blk];
 	this->spid = start_pid[this->dev, this->blk];
 	this->sppid = start_ppid[this->dev, this->blk];
 	self->sargs = (int)start_args[this->dev, this->blk] == 0 ? 
 	    "" : start_args[this->dev, this->blk];
 	self->scomm = start_comm[this->dev, this->blk];
 	this->stime = start_time[this->dev, this->blk];
	this->etime = timestamp; /* endtime */
	this->delta = this->etime - this->stime;
	this->dtime = last_event[args[1]->dev_statname] == 0 ? 0 :
	    timestamp - last_event[args[1]->dev_statname];

 	/* memory cleanup */
 	start_uid[this->dev, this->blk]  = 0;
 	start_pid[this->dev, this->blk]  = 0;
 	start_ppid[this->dev, this->blk] = 0;
 	start_args[this->dev, this->blk] = 0;
 	start_time[this->dev, this->blk] = 0;
 	start_comm[this->dev, this->blk] = 0;
 	start_rw[this->dev, this->blk]   = 0;

	/*
	 * Print details
	 */

	/* print optional fields */
 	OPT_start   ? printf("%-14d ", this->stime/1000) : 1;
 	OPT_end     ? printf("%-14d ", this->etime/1000) : 1;
 	OPT_endstr  ? printf("%-20Y ", walltimestamp) : 1;
 	OPT_devname ? printf("%-7s ", args[1]->dev_statname) : 1;
 	OPT_ins     ? printf("%3d ", args[1]->dev_instance) : 1;
 	OPT_nums    ? printf("%3d %3d ",
	    args[1]->dev_major, args[1]->dev_minor) : 1;
 	OPT_delta   ? printf("%-10d ", this->delta/1000) : 1;
 	OPT_dtime   ? printf("%-10d ", this->dtime/1000) : 1;

	/* print main fields */
	OPT_dump ? 
 	    printf("%d %d %d %s %d %d %d %d %d %d %s %d %d %s %s %s %s %S\n",
 	    this->etime/1000, this->stime/1000, this->delta/1000,
 	    args[1]->dev_statname, args[1]->dev_instance, args[1]->dev_major,
	    args[1]->dev_minor, this->suid, this->spid, this->sppid, 
	    args[0]->b_flags & B_READ ? "R" : "W", 
	    args[0]->b_blkno, args[0]->b_bcount, args[2]->fi_mount,
	    args[2]->fi_name, args[2]->fi_pathname, self->scomm, self->sargs) :
 	    printf("%5d %5d %1s %8d %6d ",
 	    this->suid, this->spid, args[0]->b_flags & B_READ ? "R" : "W",
	    args[0]->b_blkno, args[0]->b_bcount);
	OPT_args == 0 ? printf("%10s %s\n", self->scomm, args[2]->fi_pathname)
	    : 1;
 	OPT_args == 1 ? printf("%28s %S\n",
	    args[2]->fi_pathname, self->sargs) : 1;

	/* save last disk event */
	last_event[args[1]->dev_statname] = timestamp;

	/* cleanup */
	self->scomm = 0;
	self->sargs = 0;
 }

 /*
  * Prevent pending from underflowing
  * this can happen if this program is started during disk events.
  */
 io:::done
 /pending[args[1]->dev_statname] < 0/
 {
	pending[args[1]->dev_statname] = 0;
 }
'
